Data Cleaning Project

Streamlining global layoff data for accurate insights.

SQL Code


# Data Cleaning Project

Select *
from layoffs;

Select company, percentage_laid_off
from layoffs
order by percentage_laid_off desc
;

# 1. Remove Duplicates
# 2. Standardize the Data
# 3. Null & Blank Values
# 4. Remove Unnecessary Columns or Rows

Create Table layoffs_staging
Like layoffs;

Select *
from layoffs_staging;

Insert layoffs_staging
Select *
From layoffs;

Select *,
ROW_NUMBER () OVER (Partition By company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) as Row_Num
from layoffs_staging;

With duplicate_cte as
(
Select *,
ROW_NUMBER () OVER (Partition By company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) as Row_Num
from layoffs_staging
)
Select *
From duplicate_cte
Where Row_Num > 1;

# Deleting Duplicates using Create Table
CREATE TABLE `layoffs_staging_2` (
  `company` text,
  `location` text,
  `industry` text,
  `total_laid_off` int DEFAULT NULL,
  `percentage_laid_off` text,
  `date` text,
  `stage` text,
  `country` text,
  `funds_raised_millions` int DEFAULT NULL,
  `Row_Num` int
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

INSERT INTO layoffs_staging_2
Select *,
ROW_NUMBER () OVER (Partition By company, location, industry, total_laid_off, percentage_laid_off, `date`, stage, country, funds_raised_millions) as Row_Num
from layoffs_staging;

Select *
From layoffs_staging_2
WHERE Row_Num > 1;

SET SQL_SAFE_UPDATES = 0;

DELETE 
FROM layoffs_staging_2
WHERE Row_Num > 1;

# STANDARDIZING DATA

SELECT DISTINCT (TRIM(company))
FROM layoffs_staging_2;

UPDATE layoffs_staging_2
SET company = (TRIM(company));

SELECT DISTINCT industry
FROM layoffs_staging_2
ORDER BY 1;

SELECT *
FROM layoffs_staging_2
WHERE industry like '%crypto%';

UPDATE layoffs_staging_2
SET industry = 'Crypto'
WHERE industry like '%crypto%';

SELECT DISTINCT COUNTRY, (TRIM(TRAILING '.' FROM country))
FROM layoffs_staging_2
ORDER BY 1;

UPDATE layoffs_staging_2
SET country = TRIM(TRAILING '.' FROM country)
WHERE country like '%United States%';

SELECT DISTINCT COUNTRY
FROM layoffs_staging_2
ORDER BY 1;

SELECT `date`, str_to_date(`date`, '%m/%d/%Y')
FROM layoffs_staging_2;

UPDATE layoffs_staging_2
SET `date` = str_to_date(`date`, '%m/%d/%Y');

ALTER TABLE layoffs_staging_2
MODIFY COLUMN `date` date;

SELECT *
FROM layoffs_staging_2;

SELECT *
FROM layoffs_staging_2
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;

SELECT *
FROM layoffs_staging_2
WHERE industry IS NULL
OR industry = ('');

SELECT *
FROM layoffs_staging_2
WHERE company = 'Airbnb';

UPDATE layoffs_staging_2
SET industry = 'Travel'
WHERE company = 'Airbnb';

UPDATE layoffs_staging_2
SET industry = NULL
WHERE industry = '';

SELECT ind1.industry, ind2.industry
FROM layoffs_staging_2 AS ind1
JOIN layoffs_staging_2 AS ind2
	ON ind1.company = ind2.company
    AND ind1.location = ind2.location
WHERE (ind1.industry IS NULL OR ind1.industry = '')
AND (ind2.industry IS NOT NULL OR ind2.industry != '');

UPDATE layoffs_staging_2 AS ind1
JOIN layoffs_staging_2 AS ind2
	ON ind1.company = ind2.company
    AND ind1.location = ind2.location
SET ind1.industry = ind2.industry
WHERE (ind1.industry IS NULL OR ind1.industry = '')
AND (ind2.industry IS NOT NULL OR ind2.industry != '');

SELECT *
FROM layoffs_staging_2
WHERE company LIKE 'Bally%';

SELECT *
FROM layoffs_staging_2
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;

DELETE
FROM layoffs_staging_2
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;

ALTER TABLE layoffs_staging_2
DROP COLUMN Row_Num;

SELECT *
FROM layoffs_staging_2;

                            

Exploratory Data Analysis Project

Uncovering trends and patterns in global layoff data.

SQL Code


# Exploratory Data Analysis

SELECT *
FROM layoffs_staging_2;

SELECT MAX(total_laid_off), MAX(percentage_laid_off)
FROM layoffs_staging_2;

SELECT *
FROM layoffs_staging_2
WHERE percentage_laid_off = 1
ORDER BY total_laid_off desc;

SELECT company, SUM(total_laid_off)
FROM layoffs_staging_2
GROUP BY company
ORDER BY 2 DESC;

SELECT MIN(`DATE`), MAX(`DATE`)
FROM layoffs_staging_2;

SELECT industry, SUM(total_laid_off)
FROM layoffs_staging_2
GROUP BY industry
ORDER BY 2 DESC;

SELECT country, SUM(total_laid_off)
FROM layoffs_staging_2
GROUP BY country
ORDER BY 2 DESC;

SELECT YEAR (`date`), SUM(total_laid_off)
FROM layoffs_staging_2
GROUP BY YEAR (`date`)
ORDER BY 2 DESC;

SELECT stage, SUM(total_laid_off)
FROM layoffs_staging_2
GROUP BY stage
ORDER BY 2 DESC;

# ROLLING LAYOFFS BY MONTH
SELECT SUBSTRING(`date`,1,7) AS `MONTH`, SUM(total_laid_off)
FROM layoffs_staging_2
WHERE SUBSTRING(`date`,1,7) IS NOT NULL
GROUP BY `MONTH`
ORDER BY 1 ASC;

WITH Rolling_Total AS 
(
SELECT SUBSTRING(`date`,1,7) AS `MONTH`, SUM(total_laid_off) AS `Total_Off`
FROM layoffs_staging_2
WHERE SUBSTRING(`date`,1,7) IS NOT NULL
GROUP BY `MONTH`
ORDER BY 1 ASC
)
SELECT `MONTH`, Total_Off, SUM(Total_Off) OVER(ORDER BY `MONTH`) AS rolling_Total
FROM Rolling_Total;

WITH Rolling_Total AS 
(
SELECT company, SUBSTRING(`date`,1,4) AS `YEAR`, SUM(total_laid_off) AS `Total_Off`
FROM layoffs_staging_2
WHERE SUBSTRING(`date`,1,4) IS NOT NULL
GROUP BY company, `YEAR`
ORDER BY 3 DESC
)
SELECT company, `YEAR`, Total_Off, SUM(Total_Off) OVER(ORDER BY `YEAR`) AS rolling_Total
FROM Rolling_Total;

SELECT company, SUM(total_laid_off)
FROM layoffs_staging_2
GROUP BY company
ORDER BY 2 DESC;

SELECT company, YEAR(`date`), SUM(total_laid_off)
FROM layoffs_staging_2
GROUP BY company, YEAR(`date`)
ORDER BY 3 DESC;

WITH Company_Year (company, years, total_laid_off) AS 
(
SELECT company, YEAR(`date`), SUM(total_laid_off)
FROM layoffs_staging_2
WHERE YEAR(`date`) IS NOT NULL
GROUP BY company, YEAR(`date`)
)
SELECT *, DENSE_RANK() OVER (PARTITION BY years ORDER BY total_laid_off DESC) AS Ranking
FROM Company_Year
ORDER BY Ranking ASC;

WITH Company_Year (company, years, total_laid_off) AS 
(
SELECT company, YEAR(`date`), SUM(total_laid_off)
FROM layoffs_staging_2
WHERE YEAR(`date`) IS NOT NULL
GROUP BY company, YEAR(`date`)
),
Company_Year_Rank AS 
(SELECT *, DENSE_RANK() OVER (PARTITION BY years ORDER BY total_laid_off DESC) AS Ranking
FROM Company_Year
ORDER BY years ASC
)
SELECT *
FROM (Company_Year_Rank)
WHERE RANKING <= 5;